[
  {
    "title": "Zero-Shot Video Moment Retrieval from Frozen Vision-Language Models",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Luo_Zero-Shot_Video_Moment_Retrieval_From_Frozen_Vision-Language_Models_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Luo_Zero-Shot_Video_Moment_Retrieval_From_Frozen_Vision-Language_Models_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2309.00661",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "lOkj4h4_0Ic",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Investigating the Role of Attribute Context in Vision-Language Models for Object Recognition and Detection",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Buettner_Investigating_the_Role_of_Attribute_Context_in_Vision-Language_Models_for_WACV_2024_paper.html",
    "github": "krbuettner/attributes_and_vlms",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Buettner_Investigating_the_Role_of_Attribute_Context_in_Vision-Language_Models_for_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2303.10093",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": null,
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Benchmarking Out-of-Distribution Detection in Visual Question Answering",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Shi_Benchmarking_Out-of-Distribution_Detection_in_Visual_Question_Answering_WACV_2024_paper.html",
    "github": "Sxx1995/Benchmarking-Out-of-Distribution-Detection-in-Visual-Question-Answering",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Shi_Benchmarking_Out-of-Distribution_Detection_in_Visual_Question_Answering_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "Pj23xLYGt-Y",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/He_Sound3DVDet_3D_Sound_Source_Detection_Using_Multiview_Microphone_Array_and_WACV_2024_paper.html",
    "github": "yuhanghe01/Sound3DVDet",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/He_Sound3DVDet_3D_Sound_Source_Detection_Using_Multiview_Microphone_Array_and_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "kyRcuBrEcW0",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "LAVSS: Location-Guided Audio-Visual Spatial Audio Separation",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Ye_LAVSS_Location-Guided_Audio-Visual_Spatial_Audio_Separation_WACV_2024_paper.html",
    "github": "YYX666660/LAVSS",
    "web_page": null,
    "github_page": "https://yyx666660.github.io/LAVSS/",
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Ye_LAVSS_Location-Guided_Audio-Visual_Spatial_Audio_Separation_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2310.20446",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "ux8aX6vddDw",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Augment the Pairs: Semantics-Preserving Image-Caption Pair Augmentation for Grounding-based Vision and Language Models",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Yi_Augment_the_Pairs_Semantics-Preserving_Image-Caption_Pair_Augmentation_for_Grounding-Based_Vision_WACV_2024_paper.html",
    "github": "amzn/augment-the-pairs-wacv2024",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Yi_Augment_the_Pairs_Semantics-Preserving_Image-Caption_Pair_Augmentation_for_Grounding-Based_Vision_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.02536",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": null,
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "CLID: Controlled-Length Image Descriptions with Limited Data",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Hirsch_CLID_Controlled-Length_Image_Descriptions_With_Limited_Data_WACV_2024_paper.html",
    "github": "Eladhi/CLID",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Hirsch_CLID_Controlled-Length_Image_Descriptions_With_Limited_Data_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2211.14835",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "ca4jiim9D5g",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "STYLIP: Multi-Scale Style-Conditioned Prompt Learning for CLIP-based Domain Generalization",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Bose_STYLIP_Multi-Scale_Style-Conditioned_Prompt_Learning_for_CLIP-Based_Domain_Generalization_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Bose_STYLIP_Multi-Scale_Style-Conditioned_Prompt_Learning_for_CLIP-Based_Domain_Generalization_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2302.09251",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "NGQgE1q3wrs",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "THInImg: Cross-Modal Steganography for Presenting Talking Heads in Images",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Zhao_THInImg_Cross-Modal_Steganography_for_Presenting_Talking_Heads_in_Images_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Zhao_THInImg_Cross-Modal_Steganography_for_Presenting_Talking_Heads_in_Images_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.17177",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "YAFSDjDsxVM",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Enhancing Multimodal Compositional Reasoning of Visual Language Models with Generative Negative Mining",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Sahin_Enhancing_Multimodal_Compositional_Reasoning_of_Visual_Language_Models_With_Generative_WACV_2024_paper.html",
    "github": "ugorsahin/Generative-Negative-Mining",
    "web_page": null,
    "github_page": "https://ugorsahin.github.io/enhancing-multimodal-compositional-reasoning-of-vlm.html",
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Sahin_Enhancing_Multimodal_Compositional_Reasoning_of_Visual_Language_Models_With_Generative_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.03964",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": null,
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Temporal Context Enhanced Referring Video Object Segmentation",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Hu_Temporal_Context_Enhanced_Referring_Video_Object_Segmentation_WACV_2024_paper.html",
    "github": "haliphinx/TCE-RVOS",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Hu_Temporal_Context_Enhanced_Referring_Video_Object_Segmentation_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "3YXG4kZiaZA",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Fine-Grained Alignment for Cross-Modal Recipe Retrieval",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Wahed_Fine-Grained_Alignment_for_Cross-Modal_Recipe_Retrieval_WACV_2024_paper.html",
    "github": "PLAN-Lab/FARM",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Wahed_Fine-Grained_Alignment_for_Cross-Modal_Recipe_Retrieval_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "1YrmDF6eYto",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Learning to Adapt CLIP for Few-Shot Monocular Depth Estimation",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Hu_Learning_To_Adapt_CLIP_for_Few-Shot_Monocular_Depth_Estimation_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Hu_Learning_To_Adapt_CLIP_for_Few-Shot_Monocular_Depth_Estimation_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.01034",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "EA4M1vgQkFk",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Annotation-Free Audio-Visual Segmentation",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Liu_Annotation-Free_Audio-Visual_Segmentation_WACV_2024_paper.html",
    "github": "jinxiang-liu/anno-free-AVS",
    "web_page": null,
    "github_page": "https://jinxiang-liu.github.io/anno-free-AVS/",
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Liu_Annotation-Free_Audio-Visual_Segmentation_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2305.11019",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "-FF_3SDOsaU",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Rethink Cross-Modal Fusion in Weakly-Supervised Audio-Visual Video Parsing",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Xu_Rethink_Cross-Modal_Fusion_in_Weakly-Supervised_Audio-Visual_Video_Parsing_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Xu_Rethink_Cross-Modal_Fusion_in_Weakly-Supervised_Audio-Visual_Video_Parsing_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.08151",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "Qv2aPoUA_zQ",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "SDNet: An Extremely Efficient Portrait Matting Model via Self-Distillation",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Li_SDNet_An_Extremely_Efficient_Portrait_Matting_Model_via_Self-Distillation_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Li_SDNet_An_Extremely_Efficient_Portrait_Matting_Model_via_Self-Distillation_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "iZzG04VXUCc",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "FELGA: Unsupervised Fragment Embedding for Fine-Grained Cross-Modal Association",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Zhuo_FELGA_Unsupervised_Fragment_Embedding_for_Fine-Grained_Cross-Modal_Association_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Zhuo_FELGA_Unsupervised_Fragment_Embedding_for_Fine-Grained_Cross-Modal_Association_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "rs6rRc_o2Ok",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Modality-Aware Representation Learning for Zero-Shot Sketch-based Image Retrieval",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Lyou_Modality-Aware_Representation_Learning_for_Zero-Shot_Sketch-Based_Image_Retrieval_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Lyou_Modality-Aware_Representation_Learning_for_Zero-Shot_Sketch-Based_Image_Retrieval_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2401.04860",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "5KsTetjNgCQ",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Multitask Vision-Language Prompt Tuning",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Shen_Multitask_Vision-Language_Prompt_Tuning_WACV_2024_paper.html",
    "github": "sIncerass/MVLPT",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Shen_Multitask_Vision-Language_Prompt_Tuning_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2211.11720",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "f9vjQxFBHU8",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "EASUM: Enhancing Affective State Understanding through Joint Sentiment and Emotion Modeling for Multimodal Tasks",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Hwang_EASUM_Enhancing_Affective_State_Understanding_Through_Joint_Sentiment_and_Emotion_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Hwang_EASUM_Enhancing_Affective_State_Understanding_Through_Joint_Sentiment_and_Emotion_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": null,
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Complementary-Contradictory Feature Regularization Against Multimodal Overfitting",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Tejero-de-Pablos_Complementary-Contradictory_Feature_Regularization_Against_Multimodal_Overfitting_WACV_2024_paper.html",
    "github": "CyberAgentAILab/CM-VQVAE",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Tejero-de-Pablos_Complementary-Contradictory_Feature_Regularization_Against_Multimodal_Overfitting_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "R7BiKXBa0ZY",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Rotstein_FuseCap_Leveraging_Large_Language_Models_for_Enriched_Fused_Image_Captions_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Rotstein_FuseCap_Leveraging_Large_Language_Models_for_Enriched_Fused_Image_Captions_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2305.17718",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "xXUCTqmF_q4",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Describe Images in a Boring Way: Towards Cross-Modal Sarcasm Generation",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Ruan_Describe_Images_in_a_Boring_Way_Towards_Cross-Modal_Sarcasm_Generation_WACV_2024_paper.html",
    "github": "EnablerRx/CMSG-EGRM",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Ruan_Describe_Images_in_a_Boring_Way_Towards_Cross-Modal_Sarcasm_Generation_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "1Jcwau6VxVY",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Can CLIP Help Sound Source Localization?",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Park_Can_CLIP_Help_Sound_Source_Localization_WACV_2024_paper.html",
    "github": "swimmiing/ACL-SSL",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": "https://huggingface.co/spaces/swimmiing/ACL-SSL-zeroshot-demo",
    "paper_thecvf": "/papers/Park_Can_CLIP_Help_Sound_Source_Localization_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.04066",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "hgxqz9ww4rU",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Domain Aligned CLIP for Few-Shot Classification",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Gondal_Domain_Aligned_CLIP_for_Few-Shot_Classification_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Gondal_Domain_Aligned_CLIP_for_Few-Shot_Classification_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.09191",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "3sw6UWcSyNI",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "SCoRD: Subject-Conditional Relation Detection with Text-Augmented Data",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Yang_SCoRD_Subject-Conditional_Relation_Detection_With_Text-Augmented_Data_WACV_2024_paper.html",
    "github": "uvavision/SCoRD",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Yang_SCoRD_Subject-Conditional_Relation_Detection_With_Text-Augmented_Data_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2308.12910",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": null,
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Simple Token-Level Confidence Improves Caption Correctness",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Petryk_Simple_Token-Level_Confidence_Improves_Caption_Correctness_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Petryk_Simple_Token-Level_Confidence_Improves_Caption_Correctness_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2305.07021",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "iDq7V4pMgTo",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Bi-Directional Training for Composed Image Retrieval via Text Prompt Learning",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Liu_Bi-Directional_Training_for_Composed_Image_Retrieval_via_Text_Prompt_Learning_WACV_2024_paper.html",
    "github": "Cuberick-Orion/Bi-Blip4CIR",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Liu_Bi-Directional_Training_for_Composed_Image_Retrieval_via_Text_Prompt_Learning_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2303.16604",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "kCYzsmekweg",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "MOPA: Modular Object Navigation with PointGoal Agents",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Raychaudhuri_MOPA_Modular_Object_Navigation_With_PointGoal_Agents_WACV_2024_paper.html",
    "github": "3dlg-hcvc/mopa",
    "web_page": null,
    "github_page": "https://3dlg-hcvc.github.io/mopa/",
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Raychaudhuri_MOPA_Modular_Object_Navigation_With_PointGoal_Agents_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2304.03696",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "Jcspov0UpsA",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "GIPCOL: Graph-Injected Soft Prompting for Compositional Zero-Shot Learning",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Xu_GIPCOL_Graph-Injected_Soft_Prompting_for_Compositional_Zero-Shot_Learning_WACV_2024_paper.html",
    "github": "HLR/GIPCOL",
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Xu_GIPCOL_Graph-Injected_Soft_Prompting_for_Compositional_Zero-Shot_Learning_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2311.05729",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "SISClycr5hg",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Text-Guided Face Recognition using Multi-Granularity Cross-Modal Contrastive Learning",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Hasan_Text-Guided_Face_Recognition_Using_Multi-Granularity_Cross-Modal_Contrastive_Learning_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Hasan_Text-Guided_Face_Recognition_Using_Multi-Granularity_Cross-Modal_Contrastive_Learning_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2312.09367",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "ZilneI4BDNU",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "Leveraging Task-Specific Pre-Training to Reason Across Images and Videos",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Sadhu_Leveraging_Task-Specific_Pre-Training_To_Reason_Across_Images_and_Videos_WACV_2024_paper.html",
    "github": null,
    "web_page": null,
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Sadhu_Leveraging_Task-Specific_Pre-Training_To_Reason_Across_Images_and_Videos_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "PZSJhrNnKAo",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "VD-GR: Boosting Visual Dialog with Cascaded Spatial-Temporal Multi-Modal Graphs",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Abdessaied_VD-GR_Boosting_Visual_Dialog_With_Cascaded_Spatial-Temporal_Multi-Modal_Graphs_WACV_2024_paper.html",
    "github": null,
    "web_page": "https://perceptualui.org/publications/abdessaied24_wacv/",
    "github_page": null,
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Abdessaied_VD-GR_Boosting_Visual_Dialog_With_Cascaded_Spatial-Temporal_Multi-Modal_Graphs_WACV_2024_paper.pdf",
    "paper_arxiv_id": null,
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "8JLE_2lGjjw",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  },
  {
    "title": "TriCoLo: Trimodal Contrastive Loss for Text to Shape Retrieval",
    "base_url": "https://openaccess.thecvf.com/content/WACV2024",
    "title_page": "/html/Ruan_TriCoLo_Trimodal_Contrastive_Loss_for_Text_To_Shape_Retrieval_WACV_2024_paper.html",
    "github": "3dlg-hcvc/tricolo",
    "web_page": null,
    "github_page": "https://3dlg-hcvc.github.io/tricolo/",
    "colab": null,
    "modelscope": null,
    "gitee": null,
    "gitlab": null,
    "zenodo": null,
    "kaggle": null,
    "demo_page": null,
    "paper_thecvf": "/papers/Ruan_TriCoLo_Trimodal_Contrastive_Loss_for_Text_To_Shape_Retrieval_WACV_2024_paper.pdf",
    "paper_arxiv_id": "2201.07366",
    "paper_pdf": null,
    "paper_hal_science": null,
    "paper_researchgate": null,
    "paper_amazon": null,
    "youtube_id": "4YK65qDUUJs",
    "drive_google": null,
    "dropbox": null,
    "onedrive": null,
    "loom": null,
    "section": "Vision + Language and/or Other Modalities"
  }
]